{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 第17课：基于 CRF 的中文句法依存分析模型实现"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import sklearn_crfsuite\n",
    "from sklearn_crfsuite import metrics\n",
    "\n",
    "try:\n",
    "    # Standalone joblib: sklearn.externals.joblib was removed in scikit-learn 0.23.\n",
    "    import joblib\n",
    "except ImportError:\n",
    "    from sklearn.externals import joblib\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "DATA_DIR = '../data/17/'  # corpus directory; keep the trailing slash, paths are built by concatenation\n",
    "\n",
    "\n",
    "class CorpusProcess(object):\n",
    "    \"\"\"Read the pre-processed corpora and convert them into CRF features.\n",
    "\n",
    "    A corpus file holds one token per line as tab-separated columns and uses a\n",
    "    blank line to end a sentence. Column 0 feeds the ``w*`` features, column 1\n",
    "    the ``p*`` features (presumably word and POS tag -- confirm against the\n",
    "    data files) and the last column is the label to predict.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, data_dir=DATA_DIR):\n",
    "        \"\"\"Record the corpus file locations.\n",
    "\n",
    "        Args:\n",
    "            data_dir: directory prefix, including the trailing separator,\n",
    "                that contains ``train.data`` and ``dev.data``.\n",
    "        \"\"\"\n",
    "        self.train_process_path = data_dir + \"train.data\"  # pre-processed training set\n",
    "        self.test_process_path = data_dir + \"dev.data\"  # pre-processed evaluation set\n",
    "\n",
    "    def read_corpus_from_file(self, file_path):\n",
    "        \"\"\"Return all lines of the UTF-8 text file ``file_path`` as a list.\"\"\"\n",
    "        # The context manager closes the handle even when reading raises.\n",
    "        with open(file_path, 'r', encoding='utf-8') as handle:\n",
    "            return handle.readlines()\n",
    "\n",
    "    def write_corpus_to_file(self, data, file_path):\n",
    "        \"\"\"Write ``str(data)`` to ``file_path``, replacing existing content.\"\"\"\n",
    "        # Explicit UTF-8 mirrors read_corpus_from_file, so the result does not\n",
    "        # depend on the platform's default encoding.\n",
    "        with open(file_path, 'w', encoding='utf-8') as handle:\n",
    "            handle.write(str(data))\n",
    "\n",
    "    def process_sentence(self, lines):\n",
    "        \"\"\"Group raw corpus lines into sentences.\n",
    "\n",
    "        Args:\n",
    "            lines: iterable of text lines; a whitespace-only line ends a sentence.\n",
    "\n",
    "        Yields:\n",
    "            One list per sentence, each item being the list of tab-separated\n",
    "            columns of one line. Empty sentences are never yielded, and a final\n",
    "            sentence that is not followed by a blank line is still yielded.\n",
    "        \"\"\"\n",
    "        sentence = []\n",
    "        for line in lines:\n",
    "            stripped = line.strip()\n",
    "            if stripped:\n",
    "                sentence.append(stripped.split(u'\\t'))\n",
    "            elif sentence:\n",
    "                yield sentence\n",
    "                sentence = []\n",
    "        # Flush the last sentence when the input lacks a terminating blank line.\n",
    "        if sentence:\n",
    "            yield sentence\n",
    "\n",
    "    def initialize(self):\n",
    "        \"\"\"Load both corpus files and store them as lists of sentences.\"\"\"\n",
    "        train_lines = self.read_corpus_from_file(self.train_process_path)\n",
    "        test_lines = self.read_corpus_from_file(self.test_process_path)\n",
    "        self.train_sentences = list(self.process_sentence(train_lines))\n",
    "        self.test_sentences = list(self.process_sentence(test_lines))\n",
    "\n",
    "    def generator(self, train=True):\n",
    "        \"\"\"Return ``(features, tags)`` for the training set, or the dev set if ``train`` is false.\n",
    "\n",
    "        ``initialize()`` must have been called first, since it creates the\n",
    "        sentence lists read here.\n",
    "        \"\"\"\n",
    "        sentences = self.train_sentences if train else self.test_sentences\n",
    "        return self.extract_feature(sentences)\n",
    "\n",
    "    def extract_feature(self, sentences):\n",
    "        \"\"\"Build per-token feature dicts and the matching label sequences.\n",
    "\n",
    "        For every token the features are its own column 0 / column 1 values,\n",
    "        those of the previous and next token (``BOS`` / ``EOS`` / ``un`` at the\n",
    "        sentence edges) and six concatenated pair features.\n",
    "\n",
    "        Args:\n",
    "            sentences: list of sentences as produced by ``process_sentence``.\n",
    "\n",
    "        Returns:\n",
    "            ``(features, tags)``: two parallel lists with one entry per\n",
    "            sentence; ``features[k]`` is a list of dicts and ``tags[k]`` the\n",
    "            list of last-column values for the same tokens.\n",
    "        \"\"\"\n",
    "        features, tags = [], []\n",
    "        for sentence in sentences:\n",
    "            last = len(sentence) - 1\n",
    "            feature_list, tag_list = [], []\n",
    "            for i, row in enumerate(sentence):\n",
    "                feature = {\"w0\": row[0],\n",
    "                           \"p0\": row[1],\n",
    "                           \"w-1\": sentence[i - 1][0] if i != 0 else \"BOS\",\n",
    "                           \"w+1\": sentence[i + 1][0] if i != last else \"EOS\",\n",
    "                           \"p-1\": sentence[i - 1][1] if i != 0 else \"un\",\n",
    "                           \"p+1\": sentence[i + 1][1] if i != last else \"un\"}\n",
    "                # Pair features are plain string concatenations of the unigram features.\n",
    "                feature[\"w-1:w0\"] = feature[\"w-1\"] + feature[\"w0\"]\n",
    "                feature[\"w0:w+1\"] = feature[\"w0\"] + feature[\"w+1\"]\n",
    "                feature[\"p-1:p0\"] = feature[\"p-1\"] + feature[\"p0\"]\n",
    "                feature[\"p0:p+1\"] = feature[\"p0\"] + feature[\"p+1\"]\n",
    "                feature[\"p-1:w0\"] = feature[\"p-1\"] + feature[\"w0\"]\n",
    "                feature[\"w0:p+1\"] = feature[\"w0\"] + feature[\"p+1\"]\n",
    "                feature_list.append(feature)\n",
    "                tag_list.append(row[-1])\n",
    "            features.append(feature_list)\n",
    "            tags.append(tag_list)\n",
    "        return features, tags\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class ModelParser(object):\n",
    "    \"\"\"Train, evaluate and persist a CRF tagger built on CorpusProcess features.\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        \"\"\"Set the training hyper-parameters and load both corpora.\"\"\"\n",
    "        self.algorithm = \"lbfgs\"\n",
    "        self.c1 = 0.1  # passed to CRF(c1=...); L1 regularisation coefficient per sklearn-crfsuite docs\n",
    "        self.c2 = 0.1  # passed to CRF(c2=...); L2 regularisation coefficient per sklearn-crfsuite docs\n",
    "        self.max_iterations = 100\n",
    "        self.model_path = \"model.pkl\"  # where save_model() / load_model() persist the CRF\n",
    "        self.corpus = CorpusProcess()\n",
    "        self.corpus.initialize()  # reads and parses the train and dev files\n",
    "        self.model = None  # created by initialize_model()\n",
    "\n",
    "    def initialize_model(self):\n",
    "        \"\"\"Create a fresh, unfitted ``sklearn_crfsuite.CRF`` from the stored hyper-parameters.\"\"\"\n",
    "        self.model = sklearn_crfsuite.CRF(algorithm=self.algorithm,\n",
    "                                          c1=float(self.c1),\n",
    "                                          c2=float(self.c2),\n",
    "                                          max_iterations=int(self.max_iterations),\n",
    "                                          all_possible_transitions=True)\n",
    "\n",
    "    def train(self):\n",
    "        \"\"\"Fit the CRF on the training set, report dev-set scores and save the model.\n",
    "\n",
    "        Prints the weighted F1 and the per-label classification report computed\n",
    "        on the dev set, then writes the fitted model to ``self.model_path``.\n",
    "        \"\"\"\n",
    "        self.initialize_model()\n",
    "        x_train, y_train = self.corpus.generator()\n",
    "        self.model.fit(x_train, y_train)\n",
    "        labels = list(self.model.classes_)\n",
    "        x_test, y_test = self.corpus.generator(train=False)\n",
    "        y_predict = self.model.predict(x_test)\n",
    "        # The score used to be computed and thrown away; show it to the reader.\n",
    "        weighted_f1 = metrics.flat_f1_score(y_test, y_predict, average='weighted', labels=labels)\n",
    "        print('weighted F1: %.3f' % weighted_f1)\n",
    "        # Order labels by everything after their first character, then by that character.\n",
    "        sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))\n",
    "        print(metrics.flat_classification_report(y_test, y_predict, labels=sorted_labels, digits=3))\n",
    "        self.save_model()\n",
    "\n",
    "    def save_model(self):\n",
    "        \"\"\"Serialise the current model to ``self.model_path`` with joblib.\"\"\"\n",
    "        joblib.dump(self.model, self.model_path)\n",
    "\n",
    "    def load_model(self):\n",
    "        \"\"\"Restore ``self.model`` from ``self.model_path``.\n",
    "\n",
    "        joblib files are pickles: only load a file you produced yourself.\n",
    "        \"\"\"\n",
    "        self.model = joblib.load(self.model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Constructing ModelParser reads and parses train.data / dev.data via CorpusProcess.\n",
    "# train() fits the CRF, prints the dev-set classification report and calls save_model().\n",
    "model = ModelParser()\n",
    "model.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
